# Load the non-imputed LFQ intensity table and render it interactively.
lfq <- read.csv('./data/nonimputed_lfq.csv')
DT::datatable(lfq)
# Load the imputed LFQ intensity table; retag imputed columns so that
# '_TOTALS_' becomes '.I.' (e.g. KO_TOTALS_22 -> KO.I.22).
lfq_imp <- read.csv('./data/LFQ_raw_totals_imp.csv')
names(lfq_imp) <- gsub('_TOTALS_', '.I.', names(lfq_imp))
DT::datatable(lfq_imp)
# Split each table by genotype: KO = knockout, WT = wild type.
# Column matching is by substring, so this picks up the TOTALS columns.
LFQ_KO     <- select(lfq, contains('KO'))
LFQ_WT     <- select(lfq, contains('WT'))
# Same split for the imputed table.
LFQ_KO_imp <- select(lfq_imp, contains('KO'))
LFQ_WT_imp <- select(lfq_imp, contains('WT'))
First of all, we have to check the distribution of our data.
# Stack the KO intensity histograms: raw on top, imputed below.
hist_ko_raw <- plothist(LFQ_KO, 'Non-Imputed')
hist_ko_imp <- plothist(LFQ_KO_imp, 'Imputed')
ggarrange(hist_ko_raw, hist_ko_imp, nrow = 2, ncol = 1)
As we see in the histogram of the knockout columns, the data are slightly skewed to the right.
# Per-column skewness of the KO data; positive values confirm right skew.
moments::skewness(cbind(LFQ_KO,LFQ_KO_imp), na.rm=TRUE)
## KO.22 KO.23 KO.24 KO.I.22 KO.I.23 KO.I.24
## 0.6754195 0.6725489 0.5218363 0.5761698 0.5868388 0.5206581
The asymmetry score tells us that we need to transform our data, and it confirms the conclusions drawn from the histogram.
# Stack the WT intensity histograms: raw on top, imputed below.
hist_wt_raw <- plothist(LFQ_WT, 'Non-Imputed')
hist_wt_imp <- plothist(LFQ_WT_imp, 'Imputed')
ggarrange(hist_wt_raw, hist_wt_imp, nrow = 2, ncol = 1)
As we see in the histogram of the wild-type columns, the data are slightly skewed to the right.
# Per-column skewness of the WT data; positive values confirm right skew.
moments::skewness(cbind(LFQ_WT,LFQ_WT_imp), na.rm=TRUE)
## WT.22 WT.23 WT.24 WT.I.22 WT.I.23 WT.I.24
## 0.7211065 0.6383786 0.7043256 0.6285968 0.5813767 0.6579983
The asymmetry score tells us that we have to transform our data, and it confirms the conclusions drawn from the histogram.
Based on the graphic below, we will first use the natural logarithm.
# Natural-log transform of the KO columns to reduce the right skew.
LFQ_KO.log <- log(LFQ_KO) # Non-Imputed
LFQ_KO_imp.log <- log(LFQ_KO_imp) # Imputed
The data distributions looks better, but they are still skewed to the right side.
# Skewness after the log transform -- lower, but still right-skewed.
moments::skewness(cbind(LFQ_KO.log,LFQ_KO_imp.log), na.rm=TRUE)
## KO.22 KO.23 KO.24 KO.I.22 KO.I.23 KO.I.24
## 0.4538071 0.4538777 0.2723878 0.3737451 0.3857815 0.3298938
The skewness score is smaller. This is good, but let's check another transformation.
# Reciprocal (1/x) transform; note the sign of the skew flips to negative.
LFQ_KO.frac <- 1/LFQ_KO # Non-Imputed
LFQ_KO_imp.frac <- 1/LFQ_KO_imp # Imputed
moments::skewness(cbind(LFQ_KO.frac, LFQ_KO_imp.frac), na.rm=TRUE)
## KO.22 KO.23 KO.24 KO.I.22 KO.I.23 KO.I.24
## -0.24799683 -0.24994581 -0.02267739 -0.19230177 -0.20508428 -0.15637524
As we can see from the plot above and the skewness scores, the distributions aren't close to normal. Our results show that these simple transformations cannot bring the data's distribution closer to the Gaussian.
The transformation is based on the formula:
\[x' = \begin{cases} \frac{x^{\lambda}-1}{\lambda} & \lambda \neq 0 \\ \log(x) & \lambda = 0 \end{cases}\]
### Non-Imputed ###
# Estimate the optimal Box-Cox lambda for each KO replicate separately.
lambda.22 <- car::powerTransform(LFQ_KO$KO.22)$lambda
lambda.23 <- car::powerTransform(LFQ_KO$KO.23)$lambda
lambda.24 <- car::powerTransform(LFQ_KO$KO.24)$lambda
The lambda is not 0, so the first transformation is
performed.
# Apply the Box-Cox power transform with the per-replicate lambdas,
# then collect the transformed replicates into one data frame.
LFQ_KO.boxcox.22 <- car::bcPower(LFQ_KO$KO.22, lambda.22)
LFQ_KO.boxcox.23 <- car::bcPower(LFQ_KO$KO.23, lambda.23)
LFQ_KO.boxcox.24 <- car::bcPower(LFQ_KO$KO.24, lambda.24)
LFQ_KO.boxcox <- data.frame(LFQ_KO.boxcox.22, LFQ_KO.boxcox.23, LFQ_KO.boxcox.24)
colnames(LFQ_KO.boxcox) <- c('KO.22', 'KO.23', 'KO.24')
### Imputed ###
# Re-estimate the Box-Cox lambdas on the imputed KO replicates.
lambda.22 <- car::powerTransform(LFQ_KO_imp$KO.I.22)$lambda
lambda.23 <- car::powerTransform(LFQ_KO_imp$KO.I.23)$lambda
lambda.24 <- car::powerTransform(LFQ_KO_imp$KO.I.24)$lambda
The lambda is not 0, so the first transformation is
performed.
# NOTE: the LFQ_KO.boxcox.* temporaries from the non-imputed chunk are
# reused (overwritten) here before being collected into LFQ_KO_imp.boxcox.
LFQ_KO.boxcox.22 <- car::bcPower(LFQ_KO_imp$KO.I.22, lambda.22)
LFQ_KO.boxcox.23 <- car::bcPower(LFQ_KO_imp$KO.I.23, lambda.23)
LFQ_KO.boxcox.24 <- car::bcPower(LFQ_KO_imp$KO.I.24, lambda.24)
LFQ_KO_imp.boxcox <- data.frame(LFQ_KO.boxcox.22, LFQ_KO.boxcox.23, LFQ_KO.boxcox.24)
colnames(LFQ_KO_imp.boxcox) <- c('KO.I.22', 'KO.I.23', 'KO.I.24')
# Skewness after Box-Cox: near zero for every KO replicate.
moments::skewness(cbind(LFQ_KO.boxcox, LFQ_KO_imp.boxcox), na.rm=TRUE)
## KO.22 KO.23 KO.24 KO.I.22 KO.I.23 KO.I.24
## 0.01691875 0.01817077 -0.01003888 0.04771938 0.05134458 0.03856700
Finally we have got expected results. The distributions are similar to a normal distribution. 🎉
Z-score normalization standardizes data by subtracting the mean and dividing by the standard deviation. This technique transforms data into a distribution with a mean of 0 and a standard deviation of 1.
Formula: \(\tilde{y}_{ij} = \frac{y_{ij} - \bar{y}_j}{\sigma_j}\), where:
# Z-score standardization (center to mean 0, scale to sd 1).
# This is a linear transform, so it cannot change the skewness --
# hence the values below equal the raw-data skewness.
LFQ_KO.standard <- as.data.frame(scale(LFQ_KO)) # Non-Imputed
LFQ_KO_imp.standard <- as.data.frame(scale(LFQ_KO_imp)) # Imputed
moments::skewness(cbind(LFQ_KO.standard, LFQ_KO_imp.standard), na.rm=TRUE)
## KO.22 KO.23 KO.24 KO.I.22 KO.I.23 KO.I.24
## 0.6754195 0.6725489 0.5218363 0.5761698 0.5868388 0.5206581
Unfortunately the standardization didn’t help to bring the distribution closer to the normal dist.
Min-max normalization scales data to a specified range (usually [0, 1]) by subtracting the minimum value and dividing by the range of values.
# Min-max normalize a numeric vector to the [0, 1] range.
#
# df: numeric vector (may contain NAs); despite the name it is used on
#     single columns, not whole data frames.
# Returns a vector of the same length where the smallest observed value
# maps to 0 and the largest to 1. NA entries are left as NA.
# Edge cases (same as the previous na.omit/scale implementation):
# a constant vector yields NaN (zero range), an all-NA vector is
# returned unchanged (with a warning from range()).
min_max_norm <- function(df) {
  rng <- range(df, na.rm = TRUE)
  # Vectorized rescale; NAs propagate through the arithmetic, so there
  # is no need to drop them and re-insert the results by index.
  (df - rng[1]) / (rng[2] - rng[1])
}
# Min-max normalize every KO column to [0, 1] (NAs preserved).
# Pass min_max_norm directly to lapply -- the anonymous wrapper
# function(col) min_max_norm(col) was redundant.
LFQ_KO.minmax <- as.data.frame(lapply(LFQ_KO, min_max_norm)) # Non-Imputed
LFQ_KO_imp.minmax <- as.data.frame(lapply(LFQ_KO_imp, min_max_norm)) # Imputed
# Min-max is linear, so the skewness is unchanged from the raw data.
moments::skewness(cbind(LFQ_KO.minmax, LFQ_KO_imp.minmax), na.rm=TRUE)
## KO.22 KO.23 KO.24 KO.I.22 KO.I.23 KO.I.24
## 0.6754195 0.6725489 0.5218363 0.5761698 0.5868388 0.5206581
Unfortunately, again the normalization didn’t help. 😒
# Robust centering by the median only (scale=FALSE disables MAD scaling).
# Centering is linear, so the skewness stays the same.
LFQ_KO.med <- as.data.frame(DescTools::RobScale(LFQ_KO, scale=FALSE)) # Non-Imputed
LFQ_KO_imp.med <- as.data.frame(DescTools::RobScale(LFQ_KO_imp, scale=FALSE)) # Imputed
moments::skewness(cbind(LFQ_KO.med,LFQ_KO_imp.med), na.rm=TRUE)
## KO.22 KO.23 KO.24 KO.I.22 KO.I.23 KO.I.24
## 0.6754195 0.6725489 0.5218363 0.5761698 0.5868388 0.5206581
# Median centering plus MAD scaling (RobScale defaults). Again linear,
# so the skewness is unchanged.
LFQ_KO.mad <- as.data.frame(DescTools::RobScale(LFQ_KO)) # Non-Imputed
LFQ_KO_imp.mad <- as.data.frame(DescTools::RobScale(LFQ_KO_imp)) # Imputed
moments::skewness(cbind(LFQ_KO.mad,LFQ_KO_imp.mad), na.rm=TRUE)
## KO.22 KO.23 KO.24 KO.I.22 KO.I.23 KO.I.24
## 0.6754195 0.6725489 0.5218363 0.5761698 0.5868388 0.5206581
Both median centering and MAD scaling didn't give us the expected result.
# NOTE(review): LFQ_KO.lm / LFQ_KO_imp.lm are not defined in this chunk --
# presumably a linear-regression normalization built elsewhere in the
# report; verify before running this chunk in isolation.
moments::skewness(cbind(LFQ_KO.lm, LFQ_KO_imp.lm), na.rm=TRUE)
## KO.22 KO.23 KO.24 KO.I.22 KO.I.23 KO.I.24
## 0.6754195 0.6725489 0.5218363 0.5761698 0.5868388 0.5206581
Linear Regression also didn’t give us expected result.
Like earlier we will first use the logarithmic transformation.
# Natural-log transform of the WT columns to reduce the right skew.
LFQ_WT.log <- log(LFQ_WT) # Non-Imputed
LFQ_WT_imp.log <- log(LFQ_WT_imp) # Imputed
moments::skewness(cbind(LFQ_WT.log, LFQ_WT_imp.log), na.rm=TRUE)
## WT.22 WT.23 WT.24 WT.I.22 WT.I.23 WT.I.24
## 0.5066689 0.4177543 0.4800066 0.4279311 0.3823422 0.4567911
Unfortunately the result isn't closer to the Gaussian distribution.
# Reciprocal (1/x) transform of the WT columns.
LFQ_WT.frac <- 1/LFQ_WT # Non-Imputed
LFQ_WT_imp.frac <- 1/LFQ_WT_imp # Imputed
As we can see above, the results are the same as for the knockout cell type. With simple methods we can't make the distribution closer to the Gaussian.
The transformation is based on the formula:
\[x' = \begin{cases} \frac{x^{\lambda}-1}{\lambda} & \lambda \neq 0 \\ \log(x) & \lambda = 0 \end{cases}\]
### Non-Imputed ###
# Estimate the optimal Box-Cox lambda for each WT replicate separately.
lambda.22 <- car::powerTransform(LFQ_WT$WT.22)$lambda
lambda.23 <- car::powerTransform(LFQ_WT$WT.23)$lambda
lambda.24 <- car::powerTransform(LFQ_WT$WT.24)$lambda
The lambda is not 0, so the first transformation is
performed.
# Apply the Box-Cox power transform with the per-replicate lambdas,
# then collect the transformed replicates into one data frame.
LFQ_WT.boxcox.22 <- car::bcPower(LFQ_WT$WT.22, lambda.22)
LFQ_WT.boxcox.23 <- car::bcPower(LFQ_WT$WT.23, lambda.23)
LFQ_WT.boxcox.24 <- car::bcPower(LFQ_WT$WT.24, lambda.24)
LFQ_WT.boxcox <- data.frame(LFQ_WT.boxcox.22, LFQ_WT.boxcox.23, LFQ_WT.boxcox.24)
colnames(LFQ_WT.boxcox) <- c('WT.22', 'WT.23', 'WT.24')
Let’s check the imputed data.
### Imputed ###
# Re-estimate the Box-Cox lambdas on the imputed WT replicates.
lambda.22 <- car::powerTransform(LFQ_WT_imp$WT.I.22)$lambda
lambda.23 <- car::powerTransform(LFQ_WT_imp$WT.I.23)$lambda
lambda.24 <- car::powerTransform(LFQ_WT_imp$WT.I.24)$lambda
The lambda is not 0, so the first transformation is
performed.
# NOTE: the LFQ_WT.boxcox.* temporaries from the non-imputed chunk are
# reused (overwritten) here before being collected into LFQ_WT_imp.boxcox.
LFQ_WT.boxcox.22 <- car::bcPower(LFQ_WT_imp$WT.I.22, lambda.22)
LFQ_WT.boxcox.23 <- car::bcPower(LFQ_WT_imp$WT.I.23, lambda.23)
LFQ_WT.boxcox.24 <- car::bcPower(LFQ_WT_imp$WT.I.24, lambda.24)
LFQ_WT_imp.boxcox <- data.frame(LFQ_WT.boxcox.22, LFQ_WT.boxcox.23, LFQ_WT.boxcox.24)
colnames(LFQ_WT_imp.boxcox) <- c('WT.I.22', 'WT.I.23', 'WT.I.24')
# Skewness after Box-Cox: near zero for every WT replicate.
moments::skewness(cbind(LFQ_WT.boxcox, LFQ_WT_imp.boxcox), na.rm=TRUE)
## WT.22 WT.23 WT.24 WT.I.22 WT.I.23 WT.I.24
## 0.02095747 0.01822482 0.02082671 0.05750385 0.05404352 0.06835686
And again the Box-Cox transformation gave us the expected result. The distributions are similar to a normal distribution.
# Z-score standardization of the WT columns. Linear, so the skewness
# values below equal the raw-data skewness.
LFQ_WT.standard <- as.data.frame(scale(LFQ_WT)) # Non-Imputed
LFQ_WT_imp.standard <- as.data.frame(scale(LFQ_WT_imp)) # Imputed
moments::skewness(cbind(LFQ_WT.standard, LFQ_WT_imp.standard), na.rm=TRUE)
## WT.22 WT.23 WT.24 WT.I.22 WT.I.23 WT.I.24
## 0.7211065 0.6383786 0.7043256 0.6285968 0.5813767 0.6579983
Unfortunately the standardization didn’t help to bring the distribution closer to the normal dist.
# Min-max normalize every WT column to [0, 1] (NAs preserved).
# Pass min_max_norm directly to lapply -- the anonymous wrapper
# function(col) min_max_norm(col) was redundant.
LFQ_WT.minmax <- as.data.frame(lapply(LFQ_WT, min_max_norm)) # Non-Imputed
LFQ_WT_imp.minmax <- as.data.frame(lapply(LFQ_WT_imp, min_max_norm)) # Imputed
# Min-max is linear, so the skewness is unchanged from the raw data.
moments::skewness(cbind(LFQ_WT.minmax, LFQ_WT_imp.minmax), na.rm=TRUE)
## WT.22 WT.23 WT.24 WT.I.22 WT.I.23 WT.I.24
## 0.7211065 0.6383786 0.7043256 0.6285968 0.5813767 0.6579983
Unfortunately, again the normalization didn’t help. 😒
# Robust centering by the median only (scale=FALSE disables MAD scaling).
LFQ_WT.med <- as.data.frame(DescTools::RobScale(LFQ_WT, scale = FALSE)) # Non-Imputed
LFQ_WT_imp.med <- as.data.frame(DescTools::RobScale(LFQ_WT_imp, scale = FALSE)) # Imputed
moments::skewness(cbind(LFQ_WT.med,LFQ_WT_imp.med), na.rm=TRUE)
## WT.22 WT.23 WT.24 WT.I.22 WT.I.23 WT.I.24
## 0.7211065 0.6383786 0.7043256 0.6285968 0.5813767 0.6579983
# Median centering plus MAD scaling (RobScale defaults). Both variants
# are linear transforms, so the skewness stays the same as the raw data.
LFQ_WT.mad <- as.data.frame(DescTools::RobScale(LFQ_WT)) # Non-Imputed
LFQ_WT_imp.mad <- as.data.frame(DescTools::RobScale(LFQ_WT_imp)) # Imputed
moments::skewness(cbind(LFQ_WT.mad,LFQ_WT_imp.mad), na.rm=TRUE)
## WT.22 WT.23 WT.24 WT.I.22 WT.I.23 WT.I.24
## 0.7211065 0.6383786 0.7043256 0.6285968 0.5813767 0.6579983
# NOTE(review): LFQ_WT.lm / LFQ_WT_imp.lm are not defined in this chunk --
# presumably a linear-regression normalization built elsewhere; verify
# before running this chunk in isolation.
moments::skewness(cbind(LFQ_WT.lm, LFQ_WT_imp.lm), na.rm=TRUE)
## WT.22 WT.23 WT.24 WT.I.22 WT.I.23 WT.I.24
## 0.7211065 0.6383786 0.7043256 0.6285968 0.5813767 0.6579983
VSN - Variance Stabilization Normalization
# Pack KO and WT matrices into two-channel RGList objects (limma-style:
# R and G slots) so they can be variance-stabilized together.
lfq_rglist<- new('RGList', list(
R = as.matrix(LFQ_KO),
G = as.matrix(LFQ_WT)))
lfq_rglist_imp<- new('RGList', list(
R = as.matrix(LFQ_KO_imp),
G = as.matrix(LFQ_WT_imp)))
# justvsn() applies VSN; the transformed R/G matrices are read back from
# the @assayData slot of the returned object.
LFQ.vsn <- justvsn(lfq_rglist)@assayData # Non-Imputed
LFQ_imp.vsn <- justvsn(lfq_rglist_imp)@assayData # Imputed
This transformation didn’t help us either. Furthermore, we lost the differences between knockout and wild-type intensities.
# Skewness after VSN: reduced, but the distributions remain right-skewed.
# Non-imputed
moments::skewness(cbind(LFQ.vsn$R, LFQ.vsn$G), na.rm=TRUE)
## KO.22 KO.23 KO.24 WT.22 WT.23 WT.24
## 0.4488821 0.4558896 0.2747223 0.5049867 0.4228609 0.4813897
# Imputed
moments::skewness(cbind(LFQ_imp.vsn$R, LFQ_imp.vsn$G), na.rm=TRUE)
## KO.I.22 KO.I.23 KO.I.24 WT.I.22 WT.I.23 WT.I.24
## 0.5754457 0.5861375 0.5199840 0.6278960 0.5806866 0.6572911
bestNormalize Package for Automatic Normalization
Experiment 22
# Let bestNormalize pick a normalizing transformation automatically for
# each KO replicate (raw and imputed).
best.KO.22 <- bestNormalize::bestNormalize(LFQ_KO$KO.22) # Non-Imputed
best.KO_imp.22 <- bestNormalize::bestNormalize(LFQ_KO_imp$KO.I.22) # Imputed
Experiment 23
best.KO.23 <- bestNormalize::bestNormalize(LFQ_KO$KO.23) # Non-Imputed
best.KO_imp.23 <- bestNormalize::bestNormalize(LFQ_KO_imp$KO.I.23) # Imputed
Experiment 24
best.KO.24 <- bestNormalize::bestNormalize(LFQ_KO$KO.24) # Non-Imputed
best.KO_imp.24 <- bestNormalize::bestNormalize(LFQ_KO_imp$KO.I.24) # Imputed
Summary
## Rep_22 Rep_22_imp Rep_23 Rep_23_imp Rep_24 Rep_24_imp
## Method orderNorm orderNorm orderNorm orderNorm yeojohnson orderNorm
Based on the results for each column, the best method to normalize the data is Ordered Quantile Normalization for all of the samples:
# NOTE(review): LFQ_KO.bestNorm / LFQ_KO_imp.bestNorm (presumably the
# transformed x.t vectors collected from the best.KO.* objects) are
# built elsewhere -- confirm before running in isolation.
moments::skewness(cbind(LFQ_KO.bestNorm, LFQ_KO_imp.bestNorm), na.rm = TRUE)
## KO.22 KO.23 KO.24 KO.I.22
## 0.000000065283662 0.000000002353212 -1.087327117083625 -0.000000019535102
## KO.I.23 KO.I.24
## -0.000000004631138 -0.000000021849734
Experiment 22
# Let bestNormalize pick a normalizing transformation automatically for
# each WT replicate (raw and imputed).
best.WT.22 <- bestNormalize::bestNormalize(LFQ_WT$WT.22) # Non-Imputed
best.WT_imp.22 <- bestNormalize::bestNormalize(LFQ_WT_imp$WT.I.22) # Imputed
Experiment 23
best.WT.23 <- bestNormalize::bestNormalize(LFQ_WT$WT.23) # Non-Imputed
best.WT_imp.23 <- bestNormalize::bestNormalize(LFQ_WT_imp$WT.I.23) # Imputed
Experiment 24
best.WT.24 <- bestNormalize::bestNormalize(LFQ_WT$WT.24) # Non-Imputed
best.WT_imp.24 <- bestNormalize::bestNormalize(LFQ_WT_imp$WT.I.24) # Imputed
Summary
## Rep_22 Rep_22_imp Rep_23 Rep_23_imp Rep_24 Rep_24_imp
## Method orderNorm orderNorm yeojohnson orderNorm orderNorm orderNorm
As before, the best method for each column is Ordered Quantile Normalization.
# NOTE(review): LFQ_WT.bestNorm / LFQ_WT_imp.bestNorm are built
# elsewhere from the best.WT.* objects -- confirm before running
# in isolation.
moments::skewness(cbind(LFQ_WT.bestNorm, LFQ_WT_imp.bestNorm), na.rm = TRUE)
## WT.22 WT.23 WT.24 WT.I.22
## 0.000000255971820 -0.506896422649866 0.000000084736746 -0.000000003968973
## WT.I.23 WT.I.24
## -0.000000045719609 -0.000000016718898
As we can see both KO and WT data have a normal distribution, but we have lost the differences between samples.
# Load the proteinGroups table and drop rows flagged as site-only
# identifications, reverse (decoy) hits, or potential contaminants.
protein.groups <- readr::read_tsv('data/proteinGroups.txt', show_col_types = FALSE)
protein.groups <- protein.groups %>% filter(is.na(`Only identified by site`),
                                            is.na(Reverse),
                                            is.na(`Potential contaminant`))
# eig_norm1() needs a unique identifier per row, so build artificial
# names 'prot_1' ... 'prot_<nrow>' alongside the peptide sequences.
# (paste0 + seq_len is the idiomatic, length-safe form of the original
# paste(..., sep = '') over 1:nrow.)
prot.info <- data.frame(protein.groups[,"Peptide sequences"], paste0('prot_', seq_len(nrow(lfq))))
# EigenMS step 1: identify bias trends with respect to the treatment factor.
LFQ.eig1 <- eig_norm1(lfq, treatment = as.factor(c('KO', 'KO', 'KO', 'WT', 'WT', 'WT')), prot.info = prot.info)
On the plots we can see the result of the svd() function, which is:
The SVD decomposition of the matrix as computed by LAPACK,
\[X=UDV',\]
where \(U\) and \(V\) are orthogonal, \(V'\) means V transposed (and conjugated for complex input), and \(D\) is a diagonal matrix with the (non-negative) singular values \(D_{ii}\) in decreasing order. Equivalently, \(D=U'XV\), which is verified in the examples.
The returned value is a list with components:
d - a vector containing the singular values of x, of length min(n, p), sorted decreasingly.
u - a matrix whose columns contain the left singular vectors of x, present if nu > 0. Dimension c(n, nu).
v - a matrix whose columns contain the right singular vectors of x, present if nv > 0. Dimension c(p, nv).
Axes of the plots: the u values from the svd() function.
# Performing eig normalization
# EigenMS step 2: remove the bias trends identified by eig_norm1().
LFQ.eig_norm <- eig_norm2(LFQ.eig1)
# Compare raw vs normalized intensities side by side.
par(mfcol=c(1,2))
boxplot(lfq, las=2, main='Raw intensities')
boxplot(LFQ.eig_norm$norm_m, las=2, main='Normalized intensities')
# Histograms: KO columns (1:3) on top, WT columns (4:6) below.
ggarrange(
plothist(LFQ.eig_norm$norm_m[,1:3], 'Non-Imputed EigenMS'),
plothist(LFQ.eig_norm$norm_m[,4:6], ''),
nrow=2,ncol=1
)
moments::skewness(LFQ.eig_norm$norm_m, na.rm=TRUE)
## KO.22 KO.23 KO.24 WT.22 WT.23 WT.24
## 0.7008182 0.7392194 0.7785731 0.7016820 0.6860371 0.7237159
The eigen normalization shows the differences between cell types very well, but the distributions remain skewed.
# eig_norm1() needs a unique identifier per row, so build artificial
# names 'prot_1' ... 'prot_<nrow>' for the imputed table this time.
# (paste0 + seq_len replaces paste(..., sep = '') over 1:nrow.)
prot.info <- data.frame(protein.groups[,"Peptide sequences"], paste0('prot_', seq_len(nrow(lfq_imp))))
# EigenMS step 1 on the imputed intensities.
LFQ_imp.eig1 <- eig_norm1(lfq_imp, treatment = as.factor(c('KO', 'KO', 'KO', 'WT', 'WT', 'WT')), prot.info = prot.info)
## The following object is masked from TREAT (pos = 3):
##
## TREAT
# EigenMS step 2: remove the identified bias trends.
LFQ_imp.eig_norm <- eig_norm2(LFQ_imp.eig1)
# Compare raw vs normalized intensities side by side.
par(mfcol=c(1,2))
boxplot(lfq_imp, las=2, main='Raw intensities')
boxplot(LFQ_imp.eig_norm$norm_m, las=2, main='Normalized intensities')
# Histograms: KO columns (1:3) on top, WT columns (4:6) below.
ggarrange(
plothist(LFQ_imp.eig_norm$norm_m[,1:3], 'Imputed EigenMS'),
plothist(LFQ_imp.eig_norm$norm_m[,4:6], ''),
nrow=2,ncol=1
)
moments::skewness(LFQ_imp.eig_norm$norm_m, na.rm=TRUE)
## KO.I.22 KO.I.23 KO.I.24 WT.I.22 WT.I.23 WT.I.24
## 0.6040135 0.6322023 0.6980207 0.6800863 0.6629387 0.6618507
COEFFICIENT OF VARIATION (CV) / RELATIVE STANDARD DEVIATION (RSD)
This is a way to measure how spread out values are in a dataset relative to the mean. A lower RSD/CV indicates better normalization. It is calculated as:
\(CV = \frac{\sigma}{\mu}\)
where:
As we can see above, the best methods were:
Coefficient of Variation for the rest of the methods have a poor performance.
REPRODUCIBILITY OF BIOLOGICAL REPLICATES
After normalization, biological replicates should group more tightly. You can assess this by measuring the intraclass correlation coefficient (ICC) to see if replicates cluster together.
A guidelines for interpretation by Koo and Li (2016):
All normalization methods worked quite well. The skewness problem was solved with the Box-Cox transformation, and with transformed data we can normalize it with all methods.
The method chosen with bestNormalize gave us a perfect distribution shape, but we lost the main differences between samples. In my opinion, the best result was given by the Eigen Normalization method, because it preserved the characteristics of each sample, which is very important in the analysis of biological data.